{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-19T07:36:18.035966Z",
     "start_time": "2017-08-19T07:36:18.030689Z"
    }
   },
   "source": [
    "# Data Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Be careful when editing this file: it is the notebook that performs the initial data preparation for all the models."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:44.034853Z",
     "start_time": "2017-09-23T21:03:28.258946Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "import re\n",
    "import collections\n",
    "import itertools\n",
    "import bcolz\n",
    "import pickle\n",
    "sys.path.append('../lib')\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gc\n",
    "import random\n",
    "import smart_open\n",
    "import h5py\n",
    "import csv\n",
    "\n",
    "import tensorflow as tf\n",
    "import gensim\n",
    "\n",
    "import datetime as dt\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "random_state_number = 967898"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:44.631799Z",
     "start_time": "2017-09-23T21:03:44.036529Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/gpu:0', '/gpu:1']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.python.client import device_lib\n",
    "\n",
    "\n",
    "def get_available_gpus():\n",
    "    \"\"\"Return the device names of every GPU in TensorFlow's local device list.\"\"\"\n",
    "    gpu_names = []\n",
    "    for device in device_lib.list_local_devices():\n",
    "        if device.device_type == 'GPU':\n",
    "            gpu_names.append(device.name)\n",
    "    return gpu_names\n",
    "\n",
    "\n",
    "# Session whose GPU options have allow_growth switched on.\n",
    "config = tf.ConfigProto()\n",
    "config.gpu_options.allow_growth = True\n",
    "sess = tf.Session(config=config)\n",
    "\n",
    "# Show the detected GPUs as the cell output.\n",
    "get_available_gpus()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:44.675034Z",
     "start_time": "2017-09-23T21:03:44.633384Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using matplotlib backend: TkAgg\n",
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']\n",
      "`%matplotlib` prevents importing * from pylab and numpy\n",
      "  \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the saved stderr of this cell reports that the pylab import clobbered `random`,\n",
    "# so after this cell the name `random` no longer refers to the stdlib module imported in the\n",
    "# first cell. Consider `%pylab --no-import-all` or dropping %pylab altogether -- TODO confirm\n",
    "# that no later cell relies on the star-imported names before changing it.\n",
    "%pylab\n",
    "%matplotlib inline\n",
    "%load_ext autoreload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:44.680000Z",
     "start_time": "2017-09-23T21:03:44.676537Z"
    }
   },
   "outputs": [],
   "source": [
    "# Notebook-wide pandas / seaborn settings.\n",
    "pd.set_option('mode.chained_assignment', None)  # no warning/error on chained assignment\n",
    "pd.set_option('display.max_columns', 999)       # display up to 999 columns\n",
    "color = sns.color_palette()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.625420Z",
     "start_time": "2017-09-23T21:03:44.681376Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train and Test variants shape :  (3321, 4) (5668, 3)\n",
      "Train and Test text shape :  (3321, 2) (5668, 2)\n"
     ]
    }
   ],
   "source": [
    "# Variant tables: plain comma-separated files.\n",
    "train_variants_df = pd.read_csv(\"dataset/stage1/training_variants\")\n",
    "test_variants_df = pd.read_csv(\"dataset/stage1/test_variants\")\n",
    "\n",
    "# Text tables: ID and Text are separated by a literal '||'. The separator is a regex, so it is\n",
    "# written as a raw string -- in a normal string literal '\\|' is an invalid escape sequence, which\n",
    "# newer Python versions warn about. The pattern handed to pandas is unchanged.\n",
    "# header=None + skiprows=1 discards the first line of the file; names=[...] supplies the columns.\n",
    "train_text_df = pd.read_csv(\"dataset/stage1/training_text\", sep=r\"\\|\\|\", engine='python',\n",
    "                            header=None, skiprows=1, names=[\"ID\", \"Text\"])\n",
    "\n",
    "test_text_df = pd.read_csv(\"dataset/stage1/test_text\", sep=r\"\\|\\|\", engine='python',\n",
    "                           header=None, skiprows=1, names=[\"ID\", \"Text\"])\n",
    "\n",
    "print(\"Train and Test variants shape : \",train_variants_df.shape, test_variants_df.shape)\n",
    "print(\"Train and Test text shape : \",train_text_df.shape, test_text_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.646443Z",
     "start_time": "2017-09-23T21:03:46.627549Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Class</th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>FAM58A</td>\n",
       "      <td>Truncating Mutations</td>\n",
       "      <td>1</td>\n",
       "      <td>Cyclin-dependent kinases (CDKs) regulate a var...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>CBL</td>\n",
       "      <td>W802*</td>\n",
       "      <td>2</td>\n",
       "      <td>Abstract Background  Non-small cell lung canc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>CBL</td>\n",
       "      <td>Q249E</td>\n",
       "      <td>2</td>\n",
       "      <td>Abstract Background  Non-small cell lung canc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>CBL</td>\n",
       "      <td>N454D</td>\n",
       "      <td>3</td>\n",
       "      <td>Recent evidence has demonstrated that acquired...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>CBL</td>\n",
       "      <td>L399V</td>\n",
       "      <td>4</td>\n",
       "      <td>Oncogenic mutations in the monomeric Casitas B...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID    Gene             Variation  Class  \\\n",
       "0   0  FAM58A  Truncating Mutations      1   \n",
       "1   1     CBL                 W802*      2   \n",
       "2   2     CBL                 Q249E      2   \n",
       "3   3     CBL                 N454D      3   \n",
       "4   4     CBL                 L399V      4   \n",
       "\n",
       "                                                Text  \n",
       "0  Cyclin-dependent kinases (CDKs) regulate a var...  \n",
       "1   Abstract Background  Non-small cell lung canc...  \n",
       "2   Abstract Background  Non-small cell lung canc...  \n",
       "3  Recent evidence has demonstrated that acquired...  \n",
       "4  Oncogenic mutations in the monomeric Casitas B...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach the text to each training variant by ID (left join on the variants table).\n",
    "train_df = train_variants_df.merge(train_text_df, on='ID', how='left')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.669039Z",
     "start_time": "2017-09-23T21:03:46.648284Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>ACSL4</td>\n",
       "      <td>R570S</td>\n",
       "      <td>2. This mutation resulted in a myeloproliferat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>NAGLU</td>\n",
       "      <td>P521L</td>\n",
       "      <td>Abstract The Large Tumor Suppressor 1 (LATS1)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>PAH</td>\n",
       "      <td>L333F</td>\n",
       "      <td>Vascular endothelial growth factor receptor (V...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>ING1</td>\n",
       "      <td>A148D</td>\n",
       "      <td>Inflammatory myofibroblastic tumor (IMT) is a ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>TMEM216</td>\n",
       "      <td>G77A</td>\n",
       "      <td>Abstract Retinoblastoma is a pediatric retina...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID     Gene Variation                                               Text\n",
       "0   0    ACSL4     R570S  2. This mutation resulted in a myeloproliferat...\n",
       "1   1    NAGLU     P521L   Abstract The Large Tumor Suppressor 1 (LATS1)...\n",
       "2   2      PAH     L333F  Vascular endothelial growth factor receptor (V...\n",
       "3   3     ING1     A148D  Inflammatory myofibroblastic tumor (IMT) is a ...\n",
       "4   4  TMEM216      G77A   Abstract Retinoblastoma is a pediatric retina..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach the text to each test variant by ID (left join on the variants table).\n",
    "test_df = test_variants_df.merge(test_text_df, on='ID', how='left')\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.681660Z",
     "start_time": "2017-09-23T21:03:46.670868Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NO MISSING DATA ..... dream come true :)\n"
     ]
    }
   ],
   "source": [
    "import missingno as msno\n",
    "\n",
    "# Columns of the training variants table that contain at least one null value, sorted by name.\n",
    "# (sorted() returns a new list; its result used to be discarded, so the list was never sorted.)\n",
    "# NOTE(review): only train_variants_df is inspected here -- the text frames are not checked.\n",
    "missing_val_cols = sorted(train_variants_df.columns[train_variants_df.isnull().any()].tolist())\n",
    "if len(missing_val_cols) != 0:\n",
    "    msno.bar(train_variants_df[missing_val_cols],figsize=(20,8),fontsize=12,labels=True,)\n",
    "else:\n",
    "    print(\"NO MISSING DATA ..... dream come true :)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pre processing data frames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "all processing is done by fasttext, you just sit and enjoy\n",
    "\n",
    "create a text file with all the text data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.934459Z",
     "start_time": "2017-09-23T21:03:46.683115Z"
    }
   },
   "outputs": [],
   "source": [
    "%autoreload\n",
    "\n",
    "# used to identify the words inside braces for identifying some unwanted strings\n",
    "# from nltk.tokenize import SExprTokenizer\n",
    "\n",
    "# used to make sentences\n",
    "from nltk.tokenize import PunktSentenceTokenizer\n",
    "\n",
    "#used to collect words in the sentences\n",
    "from utils import custom_word_tokenizer, apply_custom_regx\n",
    "from nltk import sent_tokenize, word_tokenize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cleaning characters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T07:19:22.047218Z",
     "start_time": "2017-09-23T07:19:22.040905Z"
    }
   },
   "source": [
    "![ascii-cheat-sheet](imgs/ascii-cheat-sheet.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:46.945015Z",
     "start_time": "2017-09-23T21:03:46.936109Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep only printable characters: code points 0-31 are marked for removal, except 10\n",
    "# (new line), which is kept since it might be used by the sentence tokenizer.\n",
    "undesirable_ascii_characters = [code for code in range(32) if code != 10]\n",
    "# Translation table for str.translate: a code point mapped to None is deleted.\n",
    "undesirable_charmap = {code: None for code in undesirable_ascii_characters}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "perform unicode transformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.214636Z",
     "start_time": "2017-09-23T21:03:46.946666Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lower-case each document and discard every character that cannot be encoded as ASCII.\n",
    "train_df['Text'] = train_df['Text'].apply(\n",
    "    lambda text: text.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))\n",
    "test_df['Text'] = test_df['Text'].apply(\n",
    "    lambda text: text.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.738892Z",
     "start_time": "2017-09-23T21:03:50.216134Z"
    }
   },
   "outputs": [],
   "source": [
    "# Delete the characters listed in undesirable_charmap from each document.\n",
    "train_df['Text'] = train_df['Text'].apply(lambda text: str(text).translate(undesirable_charmap))\n",
    "test_df['Text'] = test_df['Text'].apply(lambda text: str(text).translate(undesirable_charmap))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.749806Z",
     "start_time": "2017-09-23T21:03:50.740517Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lower-case the gene symbol and drop non-ASCII characters, the same normalisation Text gets.\n",
    "# (Encoding to 'utf-8' and decoding again is a round trip that leaves the string unchanged,\n",
    "# so Gene was not being reduced to ASCII the way Text is.)\n",
    "train_df.Gene = train_df.Gene.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))\n",
    "test_df.Gene = test_df.Gene.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.761605Z",
     "start_time": "2017-09-23T21:03:50.751179Z"
    }
   },
   "outputs": [],
   "source": [
    "# Delete the characters listed in undesirable_charmap from each gene symbol.\n",
    "train_df['Gene'] = train_df['Gene'].apply(lambda gene: str(gene).translate(undesirable_charmap))\n",
    "test_df['Gene'] = test_df['Gene'].apply(lambda gene: str(gene).translate(undesirable_charmap))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.772871Z",
     "start_time": "2017-09-23T21:03:50.763207Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lower-case the variation and drop non-ASCII characters, the same normalisation Text gets.\n",
    "# (Encoding to 'utf-8' and decoding again is a round trip that leaves the string unchanged,\n",
    "# so Variation was not being reduced to ASCII the way Text is.)\n",
    "train_df.Variation = train_df.Variation.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))\n",
    "test_df.Variation = test_df.Variation.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:03:50.784803Z",
     "start_time": "2017-09-23T21:03:50.774076Z"
    }
   },
   "outputs": [],
   "source": [
    "# Delete the characters listed in undesirable_charmap from each variation string.\n",
    "train_df['Variation'] = train_df['Variation'].apply(lambda var: str(var).translate(undesirable_charmap))\n",
    "test_df['Variation'] = test_df['Variation'].apply(lambda var: str(var).translate(undesirable_charmap))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## custom cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:04:42.347926Z",
     "start_time": "2017-09-23T21:03:50.786105Z"
    }
   },
   "outputs": [],
   "source": [
    "# Run the project helper apply_custom_regx (imported from utils) over every document.\n",
    "train_df['Text'] = train_df['Text'].apply(apply_custom_regx)\n",
    "test_df['Text'] = test_df['Text'].apply(apply_custom_regx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T08:38:54.290498Z",
     "start_time": "2017-09-23T08:38:54.282364Z"
    }
   },
   "source": [
    "based on manual review of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:04:43.121520Z",
     "start_time": "2017-09-23T21:04:42.349231Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove the two-character sequence backslash + 't' (not the tab character) from each document.\n",
    "train_df['Text'] = train_df['Text'].apply(lambda text: text.replace('\\\\t', ''))\n",
    "test_df['Text'] = test_df['Text'].apply(lambda text: text.replace('\\\\t', ''))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## tokenizing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:06:46.597065Z",
     "start_time": "2017-09-23T21:04:43.122854Z"
    }
   },
   "outputs": [],
   "source": [
    "# Split each document into a list of sentences with NLTK's sent_tokenize.\n",
    "train_df['Sentences'] = train_df['Text'].apply(sent_tokenize)\n",
    "test_df['Sentences'] = test_df['Text'].apply(sent_tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:16:03.294319Z",
     "start_time": "2017-09-23T21:06:46.598402Z"
    }
   },
   "outputs": [],
   "source": [
    "# Split every sentence into word tokens: each document becomes a list of token lists.\n",
    "train_df['Sentences'] = train_df['Sentences'].apply(lambda sents: list(map(word_tokenize, sents)))\n",
    "test_df['Sentences'] = test_df['Sentences'].apply(lambda sents: list(map(word_tokenize, sents)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:16:03.912842Z",
     "start_time": "2017-09-23T21:16:03.295728Z"
    }
   },
   "outputs": [],
   "source": [
    "# Turn each gene string into a list of word tokens.\n",
    "train_df['Gene'] = train_df['Gene'].apply(word_tokenize)\n",
    "test_df['Gene'] = test_df['Gene'].apply(word_tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:16:04.538498Z",
     "start_time": "2017-09-23T21:16:03.914468Z"
    }
   },
   "outputs": [],
   "source": [
    "# Turn each variation string into a list of word tokens.\n",
    "train_df['Variation'] = train_df['Variation'].apply(word_tokenize)\n",
    "test_df['Variation'] = test_df['Variation'].apply(word_tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cleaning in word level"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove special characters from the beginning and the end of each word.\n",
    "Example words:\n",
    "\n",
    "'.black-color',\n",
    " '0.0136*',\n",
    " '-c-kit',\n",
    " '..4',\n",
    " '.01this',\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:08.993534Z",
     "start_time": "2017-09-23T21:16:04.540211Z"
    }
   },
   "outputs": [],
   "source": [
    "# Strip, from both ends of every token, all characters that are not ASCII letters or digits.\n",
    "# The leading character class used to read `A-z`; that range also covers the six characters\n",
    "# between 'Z' and 'a' ([ \\ ] ^ _ `), so tokens kept those at their start. Both classes now use A-Z.\n",
    "# Tokens consisting only of such characters become '' and stay in place, so the number of\n",
    "# tokens per sentence is unchanged.\n",
    "# doc is one document: a list of sentences, each a list of token strings.\n",
    "clean_start_end = lambda doc: [[re.sub(r'^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$', '', w) for w in sent] for sent in doc]\n",
    "train_df.Sentences = train_df.Sentences.apply(clean_start_end)\n",
    "test_df.Sentences = test_df.Sentences.apply(clean_start_end)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## saving data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:09.005192Z",
     "start_time": "2017-09-23T21:18:08.994982Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove the raw Text column from both frames, in place.\n",
    "# errors='ignore' makes the cell safe to re-run: without it a second execution raises KeyError\n",
    "# because the column has already been removed.\n",
    "train_df.drop([\"Text\"], axis=1, inplace=True, errors='ignore')\n",
    "test_df.drop([\"Text\"], axis=1, inplace=True, errors='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "save the pandas processed frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:09.183595Z",
     "start_time": "2017-09-23T21:18:09.006769Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Class</th>\n",
       "      <th>Sentences</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[fam58a]</td>\n",
       "      <td>[truncating, mutations]</td>\n",
       "      <td>1</td>\n",
       "      <td>[[cyclin-dependent, kinases, , cdks, , regulat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[w802*]</td>\n",
       "      <td>2</td>\n",
       "      <td>[[abstract, background, non-small, cell, lung,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[q249e]</td>\n",
       "      <td>2</td>\n",
       "      <td>[[abstract, background, non-small, cell, lung,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[n454d]</td>\n",
       "      <td>3</td>\n",
       "      <td>[[recent, evidence, has, demonstrated, that, a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[l399v]</td>\n",
       "      <td>4</td>\n",
       "      <td>[[oncogenic, mutations, in, the, monomeric, ca...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID      Gene                Variation  Class  \\\n",
       "0   0  [fam58a]  [truncating, mutations]      1   \n",
       "1   1     [cbl]                  [w802*]      2   \n",
       "2   2     [cbl]                  [q249e]      2   \n",
       "3   3     [cbl]                  [n454d]      3   \n",
       "4   4     [cbl]                  [l399v]      4   \n",
       "\n",
       "                                           Sentences  \n",
       "0  [[cyclin-dependent, kinases, , cdks, , regulat...  \n",
       "1  [[abstract, background, non-small, cell, lung,...  \n",
       "2  [[abstract, background, non-small, cell, lung,...  \n",
       "3  [[recent, evidence, has, demonstrated, that, a...  \n",
       "4  [[oncogenic, mutations, in, the, monomeric, ca...  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the processed training frame.\n",
    "train_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:09.377264Z",
     "start_time": "2017-09-23T21:18:09.184900Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Sentences</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[acsl4]</td>\n",
       "      <td>[r570s]</td>\n",
       "      <td>[[2, this, mutation, resulted, in, a, myelopro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[naglu]</td>\n",
       "      <td>[p521l]</td>\n",
       "      <td>[[abstract, the, large, tumor, suppressor, 1, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[pah]</td>\n",
       "      <td>[l333f]</td>\n",
       "      <td>[[vascular, endothelial, growth, factor, recep...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[ing1]</td>\n",
       "      <td>[a148d]</td>\n",
       "      <td>[[inflammatory, myofibroblastic, tumor, , imt,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[tmem216]</td>\n",
       "      <td>[g77a]</td>\n",
       "      <td>[[abstract, retinoblastoma, is, a, pediatric, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID       Gene Variation                                          Sentences\n",
       "0   0    [acsl4]   [r570s]  [[2, this, mutation, resulted, in, a, myelopro...\n",
       "1   1    [naglu]   [p521l]  [[abstract, the, large, tumor, suppressor, 1, ...\n",
       "2   2      [pah]   [l333f]  [[vascular, endothelial, growth, factor, recep...\n",
       "3   3     [ing1]   [a148d]  [[inflammatory, myofibroblastic, tumor, , imt,...\n",
       "4   4  [tmem216]    [g77a]  [[abstract, retinoblastoma, is, a, pediatric, ..."
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the processed test frame.\n",
    "test_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:29.932362Z",
     "start_time": "2017-09-23T21:18:09.380123Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2910: PerformanceWarning: \n",
      "your performance may suffer as PyTables will pickle object types that it cannot\n",
      "map directly to c-types [inferred_type->mixed,key->block1_values] [items->['Gene', 'Variation', 'Sentences']]\n",
      "\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Make sure the target directory exists, then write both frames under the keys\n",
    "# 'train_df' and 'test_df'. The `with` block closes the HDF5 file even when a write raises\n",
    "# (before, an exception skipped store.close() and left the file open).\n",
    "os.makedirs('processed/stage1', exist_ok=True)\n",
    "with pd.HDFStore('processed/stage1/data_frames.h5') as store:\n",
    "    store['train_df'] = train_df\n",
    "    store['test_df'] = test_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load data frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:49.329211Z",
     "start_time": "2017-09-23T21:18:29.933905Z"
    }
   },
   "outputs": [],
   "source": [
    "# Load the processed frames from the HDF5 store. mode='r' opens the file read-only, so a missing\n",
    "# or mistyped path raises immediately instead of silently creating an empty file (the default\n",
    "# append mode does that). The `with` block closes the file even if a read fails.\n",
    "with pd.HDFStore('processed/stage1/data_frames.h5', mode='r') as store:\n",
    "    train_df = store['train_df']\n",
    "    test_df = store['test_df']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:49.692917Z",
     "start_time": "2017-09-23T21:18:49.330969Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Class</th>\n",
       "      <th>Sentences</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[fam58a]</td>\n",
       "      <td>[truncating, mutations]</td>\n",
       "      <td>1</td>\n",
       "      <td>[[cyclin-dependent, kinases, , cdks, , regulat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[w802*]</td>\n",
       "      <td>2</td>\n",
       "      <td>[[abstract, background, non-small, cell, lung,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[q249e]</td>\n",
       "      <td>2</td>\n",
       "      <td>[[abstract, background, non-small, cell, lung,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[n454d]</td>\n",
       "      <td>3</td>\n",
       "      <td>[[recent, evidence, has, demonstrated, that, a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[cbl]</td>\n",
       "      <td>[l399v]</td>\n",
       "      <td>4</td>\n",
       "      <td>[[oncogenic, mutations, in, the, monomeric, ca...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID      Gene                Variation  Class  \\\n",
       "0   0  [fam58a]  [truncating, mutations]      1   \n",
       "1   1     [cbl]                  [w802*]      2   \n",
       "2   2     [cbl]                  [q249e]      2   \n",
       "3   3     [cbl]                  [n454d]      3   \n",
       "4   4     [cbl]                  [l399v]      4   \n",
       "\n",
       "                                           Sentences  \n",
       "0  [[cyclin-dependent, kinases, , cdks, , regulat...  \n",
       "1  [[abstract, background, non-small, cell, lung,...  \n",
       "2  [[abstract, background, non-small, cell, lung,...  \n",
       "3  [[recent, evidence, has, demonstrated, that, a...  \n",
       "4  [[oncogenic, mutations, in, the, monomeric, ca...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variation</th>\n",
       "      <th>Sentences</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[acsl4]</td>\n",
       "      <td>[r570s]</td>\n",
       "      <td>[[2, this, mutation, resulted, in, a, myelopro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[naglu]</td>\n",
       "      <td>[p521l]</td>\n",
       "      <td>[[abstract, the, large, tumor, suppressor, 1, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[pah]</td>\n",
       "      <td>[l333f]</td>\n",
       "      <td>[[vascular, endothelial, growth, factor, recep...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[ing1]</td>\n",
       "      <td>[a148d]</td>\n",
       "      <td>[[inflammatory, myofibroblastic, tumor, , imt,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[tmem216]</td>\n",
       "      <td>[g77a]</td>\n",
       "      <td>[[abstract, retinoblastoma, is, a, pediatric, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID       Gene Variation                                          Sentences\n",
       "0   0    [acsl4]   [r570s]  [[2, this, mutation, resulted, in, a, myelopro...\n",
       "1   1    [naglu]   [p521l]  [[abstract, the, large, tumor, suppressor, 1, ...\n",
       "2   2      [pah]   [l333f]  [[vascular, endothelial, growth, factor, recep...\n",
       "3   3     [ing1]   [a148d]  [[inflammatory, myofibroblastic, tumor, , imt,...\n",
       "4   4  [tmem216]    [g77a]  [[abstract, retinoblastoma, is, a, pediatric, ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(train_df.head())\n",
    "display(test_df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## generate wordidx, vocab_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:52.978060Z",
     "start_time": "2017-09-23T21:18:49.694142Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "286977"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_words = train_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))\n",
    "train_words = list(itertools.chain.from_iterable(train_words))\n",
    "len(set(train_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:59.434232Z",
     "start_time": "2017-09-23T21:18:52.979807Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "343861"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_words = test_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))\n",
    "test_words = list(itertools.chain.from_iterable(test_words))\n",
    "len(set(test_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:59.441096Z",
     "start_time": "2017-09-23T21:18:59.435669Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3018"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_variations = set(list(itertools.chain.from_iterable(train_df.Variation)))\n",
    "len(train_variations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:59.448642Z",
     "start_time": "2017-09-23T21:18:59.442508Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5634"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_variations = set(list(itertools.chain.from_iterable(test_df.Variation)))\n",
    "len(test_variations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:59.454693Z",
     "start_time": "2017-09-23T21:18:59.449998Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "264"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_genes = set(list(itertools.chain.from_iterable(train_df.Gene)))\n",
    "len(train_genes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:18:59.461439Z",
     "start_time": "2017-09-23T21:18:59.456034Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1397"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_genes = set(list(itertools.chain.from_iterable(test_df.Gene)))\n",
    "len(test_genes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build `vocab_words` (the list of all distinct tokens) and `vocab_wordidx` (each token's index in that list)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.013345Z",
     "start_time": "2017-09-23T21:18:59.462787Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "352215"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_words = list(set(train_words) | set(test_words) | train_variations | test_variations | train_genes | test_genes)\n",
    "len(vocab_words)"
   ]
  },
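  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add special marker tokens to the vocabulary: unknown word (`<UNK>`), start/end of sentence (`<SOSent>`, `<EOSent>`) and start/end of document (`<SODoc>`, `<EODoc>`)."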
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.019045Z",
     "start_time": "2017-09-23T21:19:04.014670Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "352220"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_words.append(\"<UNK>\")\n",
    "vocab_words.append(\"<SOSent>\")\n",
    "vocab_words.append(\"<EOSent>\")\n",
    "vocab_words.append(\"<SODoc>\")\n",
    "vocab_words.append(\"<EODoc>\")\n",
    "len(vocab_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.120757Z",
     "start_time": "2017-09-23T21:19:04.020218Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "352220"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_wordidx = {w:i for i,w in enumerate(vocab_words)}\n",
    "len(vocab_wordidx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.284138Z",
     "start_time": "2017-09-23T21:19:04.121867Z"
    }
   },
   "outputs": [],
   "source": [
    "with open('processed/stage1/vocab_words_wordidx.pkl', 'wb') as f:\n",
    "    pickle.dump((vocab_words, vocab_wordidx), f, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.401372Z",
     "start_time": "2017-09-23T21:19:04.285814Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(352220, 352220)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:\n",
    "    (vocab_words1, vocab_wordidx1) = pickle.load(f)\n",
    "len(vocab_words1), len(vocab_wordidx1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:25:21.141719Z",
     "start_time": "2017-09-23T21:25:21.134756Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(36141974, 54273869)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_words), len(test_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:19:04.410479Z",
     "start_time": "2017-09-23T21:19:04.407370Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "str"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(test_words[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T21:25:26.149248Z",
     "start_time": "2017-09-23T21:25:23.957107Z"
    }
   },
   "outputs": [],
   "source": [
    "with open('processed/stage1/all_text.txt', 'w') as f:\n",
    "    f.write(\" \".join(train_words))\n",
    "    f.write(\" \")\n",
    "    f.write(\" \".join(test_words))\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## verifying words and text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:57:39.231727Z",
     "start_time": "2017-09-23T20:57:38.984916Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'.hepatic',\n",
       " '.h3k27ac',\n",
       " 'ln2/',\n",
       " \"'.\",\n",
       " 'powerpointxrcc2/',\n",
       " '10p12.33p12.2',\n",
       " '~500x',\n",
       " 'p.gln76x',\n",
       " 'anchorage-',\n",
       " '.references',\n",
       " 'bm/',\n",
       " '.setd2/set2',\n",
       " '.single-nucleus',\n",
       " 'pylori_',\n",
       " '.polyclonal',\n",
       " 'cspv',\n",
       " '.4575dela',\n",
       " '.tsq-vantage',\n",
       " '1.3-',\n",
       " 'ma-',\n",
       " '.677',\n",
       " '.large',\n",
       " '.078',\n",
       " '.cofilin',\n",
       " 'u-',\n",
       " '.constitutional',\n",
       " 'repoch+',\n",
       " '84.',\n",
       " '-gcaatatcagccttaggtgcggctc-3',\n",
       " '.fletcher',\n",
       " 'pan-',\n",
       " '.yoshimoto',\n",
       " '4.3425',\n",
       " '//www.uib.no/aasland/chrab/',\n",
       " '-cd4',\n",
       " \"'intermediate\",\n",
       " '/nf-b',\n",
       " '3inst',\n",
       " '//exac.broadinstitute.org/',\n",
       " '4001600',\n",
       " '3538.',\n",
       " 'g0-',\n",
       " '-d910a1/-d910a',\n",
       " 'mcf10atp53+/+',\n",
       " '55+a171',\n",
       " 'anti-ib-',\n",
       " \"'kinome\",\n",
       " '|il',\n",
       " 'q572*',\n",
       " '.anti-bach1',\n",
       " 'tcga-az-6598-01',\n",
       " '.mzb1',\n",
       " '81.24',\n",
       " '.sozzi',\n",
       " '.termination',\n",
       " '.very',\n",
       " '-tropomyosin',\n",
       " '1.0.',\n",
       " '4552045880',\n",
       " '.cullins-2',\n",
       " '.km12',\n",
       " 'osterix+',\n",
       " '.engelman',\n",
       " '90.',\n",
       " '//www.ncbi.nlm.nih.gov/ncicgap',\n",
       " 'mmnr~.~',\n",
       " 'abcg2/',\n",
       " '//www.broadinstitute.org/gatk/',\n",
       " 'jsl~~~~~~~~',\n",
       " '.n-t',\n",
       " 'chk2-',\n",
       " 'p.y261*',\n",
       " '.isoelectric',\n",
       " '.abundant',\n",
       " '.856c',\n",
       " '.enw',\n",
       " 'fgfr1/hr+',\n",
       " '.twenty-four-hour',\n",
       " 'sds-',\n",
       " '-r505c',\n",
       " '.ls',\n",
       " '.n107i',\n",
       " '.1453',\n",
       " 'tagagcatgacccatgag',\n",
       " '-p65',\n",
       " '7900.',\n",
       " '*li7',\n",
       " 'p190s',\n",
       " 'mcdb153++',\n",
       " '-thio',\n",
       " '-quinolin-4-yloxy',\n",
       " '4-6.',\n",
       " '.asymptotic',\n",
       " '.mda-mb-231',\n",
       " ',18have',\n",
       " '-translocation',\n",
       " 'smith-water-',\n",
       " '.deletion',\n",
       " '1216.',\n",
       " '.5ca',\n",
       " 'val-158-',\n",
       " '.b-raf',\n",
       " '.friedlander',\n",
       " 'c.-1114-',\n",
       " ',51',\n",
       " 'a*0303n',\n",
       " '.-h.',\n",
       " 'spectrumgreen-',\n",
       " '.representative',\n",
       " '140.',\n",
       " 'p.v717a',\n",
       " '~62',\n",
       " '/extracellular',\n",
       " 'raf/',\n",
       " '18,609',\n",
       " '2/423',\n",
       " '.fact',\n",
       " 'k248e',\n",
       " '.kda',\n",
       " 'asxi1+/',\n",
       " '/www.nchgr.nih.gov/',\n",
       " '.tg101348-mediated',\n",
       " '.15,28',\n",
       " 'del.755759*',\n",
       " '-10.28',\n",
       " '++++++',\n",
       " '.idh1mut',\n",
       " '.a11031',\n",
       " '32,200',\n",
       " '~282kb',\n",
       " 'q354e',\n",
       " 'y-~hain.~.~',\n",
       " 'whsc1/',\n",
       " 'brip1-',\n",
       " '.film',\n",
       " '835g',\n",
       " 'methionine/',\n",
       " '.zr-75-1',\n",
       " '*0b',\n",
       " '._862+',\n",
       " 'ik..',\n",
       " '0.390.61',\n",
       " '.existing',\n",
       " '.full43k',\n",
       " '.amount',\n",
       " '-,3',\n",
       " 'p1/p4',\n",
       " 'cs27',\n",
       " 'phosphatidylinositol-',\n",
       " '//evs.gs.washington.edu/evs',\n",
       " 'g309e-',\n",
       " '.448c',\n",
       " '1.85.',\n",
       " 'er-',\n",
       " '|cbf',\n",
       " '-h2axdeveloped',\n",
       " '-0.547eleven',\n",
       " 'nct01384253',\n",
       " 'ofpole-exo*',\n",
       " 'agacag/gtatgg',\n",
       " 'pbs/',\n",
       " '.psrc',\n",
       " 'gtctttacccattatttatagg',\n",
       " 'tggccacaaagttcttgg',\n",
       " '32-34.',\n",
       " '460.6',\n",
       " 't573s',\n",
       " 'c.463g',\n",
       " 'withtgf-',\n",
       " '.1795',\n",
       " '.role',\n",
       " '+-+-',\n",
       " '+nk',\n",
       " '6p12.1q11.1',\n",
       " '78a.u.',\n",
       " '+81-22-7178142',\n",
       " '.revisiting',\n",
       " '.wimmer',\n",
       " '18522.',\n",
       " '.immunoblots',\n",
       " '.details',\n",
       " '.plx-4032',\n",
       " '-15',\n",
       " 'plc-',\n",
       " '.maternal',\n",
       " '.ntrk1',\n",
       " '460-',\n",
       " 'gtgtag/gtgtct',\n",
       " 'p.v736a',\n",
       " '.4,11',\n",
       " 'gtaaattcc',\n",
       " '.calpains',\n",
       " '.asp2215',\n",
       " '.mfe-280',\n",
       " '.rfpvhl-positive',\n",
       " 'v-ras-',\n",
       " '/1kdx',\n",
       " '.au',\n",
       " 'probasin/',\n",
       " 'haploinsuffi-',\n",
       " 'mutation.',\n",
       " '.alectinib',\n",
       " '.dysregulated',\n",
       " '*conserved',\n",
       " '100t',\n",
       " 'c.442_443delinst',\n",
       " 'p.w.',\n",
       " 'stk11/lkb1+/',\n",
       " '.alkaline',\n",
       " '.hs1',\n",
       " '.lymphoma/thymoma',\n",
       " '-jq133',\n",
       " 'residues/',\n",
       " '.6kras',\n",
       " 'trypsin-giemsa-',\n",
       " '.703',\n",
       " '10a-10b-',\n",
       " '59.',\n",
       " '-gccagcattttagcattacttc-30',\n",
       " 'lats2-',\n",
       " 'ctccttggttcagattctgc',\n",
       " '.tazarotene-induced',\n",
       " '.alex',\n",
       " '3199',\n",
       " 'tgccttgtatcgaatgaa',\n",
       " 'cosm51966',\n",
       " '12.5-',\n",
       " '//expasy.org/tools/sim-prot.html',\n",
       " '.communication',\n",
       " 'desm.',\n",
       " '2.788',\n",
       " 'pc9-bimi2/',\n",
       " '-substituted',\n",
       " '.l979p',\n",
       " '.banding',\n",
       " 'c.892_893insc',\n",
       " '.plasminogen',\n",
       " '.md',\n",
       " '.s6c',\n",
       " '.nicd',\n",
       " '.concordance',\n",
       " '.staber',\n",
       " '161:267-80.',\n",
       " 'a06',\n",
       " '.bromodeoxyuridine',\n",
       " '.abbreviations',\n",
       " '3,3-',\n",
       " '.mfs2',\n",
       " 'heavy-',\n",
       " 'controls*',\n",
       " 's477f',\n",
       " 'tgcaaa/gtaagt',\n",
       " 'flk2+',\n",
       " '.eker',\n",
       " '.scr',\n",
       " '.gilliland',\n",
       " 'klf5*',\n",
       " 'ch11:47.1-47.3',\n",
       " '1-cyclopentyl-n-',\n",
       " '6,049ivs6+7ga',\n",
       " '-granulocyte',\n",
       " '.predicts',\n",
       " 'mutations/',\n",
       " '1,088a622v',\n",
       " '.human',\n",
       " '9.073',\n",
       " '~64',\n",
       " '.glucose',\n",
       " '//www-p53.iarc.fr',\n",
       " '.anti-as160',\n",
       " 'h.',\n",
       " '.5329a',\n",
       " '/ncr-balb/c',\n",
       " '/4p16',\n",
       " '-1h-indazole-4-carboxamide',\n",
       " 'bl41_haeatopoie',\n",
       " '.e2419bottom',\n",
       " '.jab13',\n",
       " 'gtatgtttg',\n",
       " '0.533*',\n",
       " '.baby',\n",
       " '.dgc',\n",
       " 'functions*',\n",
       " '.y297a',\n",
       " 'rg-5t',\n",
       " '5.5.3.',\n",
       " '.ha-fancg',\n",
       " '.inter-observation',\n",
       " '.er',\n",
       " \"'one\",\n",
       " 'brca2flox/-',\n",
       " '.nakanishi',\n",
       " 'p.g824d',\n",
       " 'r-1-',\n",
       " '-shcontrol',\n",
       " '.n676',\n",
       " 'tcga-fs-a4fc-06',\n",
       " 's287y',\n",
       " 'cd19+',\n",
       " '.ron',\n",
       " '.frap',\n",
       " '498.',\n",
       " '-egf',\n",
       " '24312435.',\n",
       " '-rip',\n",
       " 'mutl+',\n",
       " '774.',\n",
       " '+664',\n",
       " '.high-resolution',\n",
       " '.1389',\n",
       " '.peroxide',\n",
       " '.only',\n",
       " 'caagttgagtccttgcagctg',\n",
       " 'c.3226c',\n",
       " 'bcr-abl1-t315i-',\n",
       " '/thra',\n",
       " 'na/naovk18',\n",
       " '.hcceosin',\n",
       " 'p.t854i',\n",
       " '.ktn3379',\n",
       " '.within',\n",
       " '.baltimore',\n",
       " '.milestones',\n",
       " 'fancd2-',\n",
       " '.trancriptional',\n",
       " '.aurora1',\n",
       " '.overexpression',\n",
       " '.inoverall',\n",
       " '.1321155111i15',\n",
       " '.duplicate',\n",
       " '..5d',\n",
       " 'nlm.nih.gov/igblast/',\n",
       " 'bcrabl+',\n",
       " '//www.interscience.wiley.com/jpages/0022-3417/',\n",
       " '.shoc2s2g',\n",
       " '.cd5',\n",
       " '~730',\n",
       " '2.96107s1101n',\n",
       " '0.899',\n",
       " 'nb0175',\n",
       " '.1514g',\n",
       " '.proliferating',\n",
       " '-stands',\n",
       " '.apoptotic',\n",
       " '1997.',\n",
       " '-glycerophosphate',\n",
       " '.p-kit',\n",
       " '.106',\n",
       " '.liposome',\n",
       " 'e8.2',\n",
       " '3.2.1.',\n",
       " '/phospho-rela',\n",
       " 'rp11174b22',\n",
       " '.7i',\n",
       " '145.',\n",
       " 'finger-',\n",
       " '..55',\n",
       " '.trisomy',\n",
       " '.3,13',\n",
       " '859a',\n",
       " '-polar',\n",
       " '.results',\n",
       " '.partial',\n",
       " '27:602613.',\n",
       " '.p-y',\n",
       " 'm1043t',\n",
       " 'm-category',\n",
       " 'r-ice+',\n",
       " '.notablythat',\n",
       " 't3/',\n",
       " '47.81.6',\n",
       " '.5x',\n",
       " '71229',\n",
       " '5-aa-',\n",
       " 'brca1/16',\n",
       " '.non',\n",
       " '.subclones',\n",
       " 'm/55',\n",
       " '-trr',\n",
       " '.chemiluminescence',\n",
       " '810.',\n",
       " '.howevermean',\n",
       " '.evidence',\n",
       " '.insummarize',\n",
       " 'dmba-',\n",
       " '209288',\n",
       " '.tbp',\n",
       " '574pt',\n",
       " '.cos1',\n",
       " '.expert',\n",
       " '.using',\n",
       " '.thr323',\n",
       " '.urea',\n",
       " 'k-rasg12dsmad4+/',\n",
       " '.n581d',\n",
       " '*based',\n",
       " '.shifts',\n",
       " 'fgfr3-',\n",
       " \"'2i-labeled\",\n",
       " '.oltipraz',\n",
       " '.rosenfeld',\n",
       " '.tr-i',\n",
       " '-tubulin',\n",
       " 'ex7',\n",
       " '..3c',\n",
       " '.arntflox/flox',\n",
       " '+1+2/-12',\n",
       " '-hcl',\n",
       " 'c.533a',\n",
       " '.neoplasia',\n",
       " 'y552_w556del',\n",
       " 'given.a*0104n',\n",
       " '1:6400.',\n",
       " '3*',\n",
       " '.narumiya',\n",
       " '.gtf',\n",
       " '.e-mexp-2197',\n",
       " '9,27.',\n",
       " '~barshop',\n",
       " '.os',\n",
       " 'msi/p53-',\n",
       " '.confocal',\n",
       " '.flanking',\n",
       " '.557',\n",
       " '.probes',\n",
       " '.ar-fl',\n",
       " '3/435',\n",
       " '0.6*',\n",
       " 'repeat-',\n",
       " '.mmc',\n",
       " '.resu',\n",
       " '.8d',\n",
       " 'l869r/',\n",
       " '/phenylalanine',\n",
       " '.smad5',\n",
       " 'vggrtmlpyrwmppesi',\n",
       " '.sumoylation',\n",
       " '.macrocephaly',\n",
       " '-gacactctcagcatggacga-30',\n",
       " 'lox-',\n",
       " '.098',\n",
       " '.s39',\n",
       " '-12.6',\n",
       " 'e004',\n",
       " '.cooperating',\n",
       " '558563',\n",
       " 'period-',\n",
       " '.myc-induced',\n",
       " 'rars-t.',\n",
       " 'muts-',\n",
       " '.s5eof',\n",
       " '.bioactive',\n",
       " 'stem-',\n",
       " '3.3.1.',\n",
       " 'bt474/',\n",
       " '.760g',\n",
       " '.bertholon',\n",
       " ':801812',\n",
       " '//doi.org/10.1371/journal.pbio.0050109.g003in',\n",
       " '-vector',\n",
       " '.mek1-c121s',\n",
       " '9.80',\n",
       " '.hisd30n',\n",
       " 'fanca/',\n",
       " '.5,9',\n",
       " 'smarcb1/',\n",
       " '.acini',\n",
       " '.n-tad',\n",
       " '.fgfr2c278f',\n",
       " '.tam',\n",
       " '.averages',\n",
       " 'milligen/',\n",
       " '.lymphocyte',\n",
       " '11p14',\n",
       " 'nlower/',\n",
       " '3,266',\n",
       " '.cell-adhesion',\n",
       " '~140,000',\n",
       " '.ets1',\n",
       " '.trfa',\n",
       " 'c.182dela',\n",
       " 'vegf-d.',\n",
       " '.025additional',\n",
       " '//fold-x.emblheidelberg.de',\n",
       " 'gtgagttgg',\n",
       " '.high-fidelity',\n",
       " '386.',\n",
       " '.cnas',\n",
       " '1061c',\n",
       " '.v224a',\n",
       " '.high-molecular',\n",
       " 'dbwu*',\n",
       " 'ccr=',\n",
       " '-neomycin-herpes',\n",
       " '.progestagens',\n",
       " '.remodelling',\n",
       " '.hbz',\n",
       " 'only.thumbnail',\n",
       " ':10714',\n",
       " '69.',\n",
       " '1482570115112300nm_080676nm_207359',\n",
       " 'mll1+/+',\n",
       " '20/143',\n",
       " '-by',\n",
       " '.8,40',\n",
       " 'dqa1*0501+',\n",
       " '.mcns',\n",
       " '-g2101aexpressing',\n",
       " '.designations',\n",
       " 'rp11512i24',\n",
       " '//mordred.bioc.cam.ac.uk/~sdm/sdm.php',\n",
       " '.lncap/ar',\n",
       " 'cd44+',\n",
       " '.s6e',\n",
       " '.glomeruli',\n",
       " '~800,000',\n",
       " '-hemagglutinin',\n",
       " '149.',\n",
       " '.me-pcrs',\n",
       " '550558',\n",
       " 'brca1/23',\n",
       " 'taatag/caaatg',\n",
       " ':csb',\n",
       " '.maximal',\n",
       " '.127',\n",
       " '.brightfield',\n",
       " '3261232866',\n",
       " '.13paraffin-embedded',\n",
       " '//imagej.nih.gov/ij/',\n",
       " '1.58/2.62',\n",
       " '.smad3/',\n",
       " 'chr1:78193604.',\n",
       " 'p300+/+',\n",
       " '.less',\n",
       " '.0070*',\n",
       " '.hpb-all',\n",
       " '0.08811',\n",
       " '+der',\n",
       " '0.34.7',\n",
       " '.libraries',\n",
       " '5/435',\n",
       " '206.',\n",
       " 'p.met1775arg',\n",
       " '-11',\n",
       " '-5.2',\n",
       " '//chromium.liacs.nl/lovd2/colon_cancer/',\n",
       " '-cateninof',\n",
       " '.9,11,12',\n",
       " '.sos',\n",
       " 'lyn-',\n",
       " '.twin',\n",
       " 'g1913/',\n",
       " 'feb.',\n",
       " '.thisis',\n",
       " '/pax8-ppar1',\n",
       " 'h250fs',\n",
       " '.cts',\n",
       " '|u',\n",
       " '.keeping',\n",
       " '.overproduction',\n",
       " '.rok-',\n",
       " '*average',\n",
       " '6.6079',\n",
       " '/c',\n",
       " 'nm_031965',\n",
       " 'ig-',\n",
       " '-ctgccccggttcatcctgatggagctcatggcg-30',\n",
       " '/cd13',\n",
       " '.simple',\n",
       " '1715',\n",
       " '.c-pegfr',\n",
       " 'iden-',\n",
       " 'sequence-',\n",
       " '|-sandwich',\n",
       " '//www.aocstudy.org/',\n",
       " '.left',\n",
       " '//',\n",
       " '4.2.',\n",
       " '.pax5-etv6',\n",
       " 'n756da',\n",
       " 'tcga-bt-a0yx-01',\n",
       " '+/mld2',\n",
       " ',700',\n",
       " '.vincent',\n",
       " '-0.17',\n",
       " '.boxes',\n",
       " '.5115a',\n",
       " '.allogeneic',\n",
       " '0.5.9.',\n",
       " '.whenever',\n",
       " '.postnatal',\n",
       " '/ca\\\\',\n",
       " '46,47.',\n",
       " '-branched',\n",
       " 'cbl-y371h-',\n",
       " 'c.1970dupa',\n",
       " '30-86',\n",
       " '.rev-cdk12',\n",
       " 'er+/pr+',\n",
       " '-157',\n",
       " '10595.',\n",
       " '.wortmannin',\n",
       " '.s1a',\n",
       " '.dicer',\n",
       " '.-tubulin',\n",
       " '.endoscopic',\n",
       " '.underlying',\n",
       " '7.1489615',\n",
       " 'e163fsx213',\n",
       " '.heterotrimeric',\n",
       " '.two-side',\n",
       " '.exposure',\n",
       " 'tp53/tsc2/',\n",
       " '.never-smokers',\n",
       " '.tpgs',\n",
       " 'r258fs',\n",
       " '//www.bloodjournal.org/content/bloodjournal/103/6/2019/f2.medium.gif',\n",
       " '.oligo',\n",
       " '-512-',\n",
       " '=0.038',\n",
       " 'males/females',\n",
       " 'cctctatag',\n",
       " '-lats1',\n",
       " '+/severe',\n",
       " 'immuno-',\n",
       " 'wild-',\n",
       " '.e2',\n",
       " 'v343a',\n",
       " '.four-micrometre',\n",
       " '.cytospin',\n",
       " 'oligonucleotide/',\n",
       " '-89',\n",
       " '.tgof',\n",
       " '.ets1/pax5',\n",
       " 'p16ink4a-/-/p19arf-/-',\n",
       " '.pxs',\n",
       " '.conclusionsour',\n",
       " '//www.rbej.com/content/8/1/61/',\n",
       " '.076mpfs',\n",
       " ':22412252',\n",
       " 'p.gln2325*',\n",
       " '*stages',\n",
       " '-g364r',\n",
       " '201566_x_at',\n",
       " '-cherry',\n",
       " 'mkn1*',\n",
       " 'sequencing*',\n",
       " 'v3-7ii',\n",
       " '.tonic',\n",
       " '.ann',\n",
       " '1617.',\n",
       " '74/m',\n",
       " '.ret',\n",
       " 'bimi2+/+',\n",
       " '_7p_',\n",
       " 'dbf2-',\n",
       " 'ptch1+/',\n",
       " '~stopped',\n",
       " 'mkn45-',\n",
       " '-trcp.2.2',\n",
       " 'aminotrans-',\n",
       " '-17',\n",
       " 'cd36+',\n",
       " '28/crc',\n",
       " '.pb-implicated',\n",
       " '.megakaryocytes',\n",
       " '.nvp-tae684',\n",
       " '.cbte',\n",
       " '~450',\n",
       " '5,541i1929v',\n",
       " '.maintenance',\n",
       " '.cem',\n",
       " ':lys2',\n",
       " '+501',\n",
       " 'p.ser764thrfsx53',\n",
       " 'akt1-',\n",
       " 'min/',\n",
       " '.815',\n",
       " 'g577a',\n",
       " '/distilled',\n",
       " 'e022',\n",
       " '.interpretation',\n",
       " 'ccctccagcacacatgcatgtaccg',\n",
       " '.the',\n",
       " '.m',\n",
       " '.625c',\n",
       " 'lo/',\n",
       " 'score*',\n",
       " '0.809202',\n",
       " '281.',\n",
       " '.graphpad',\n",
       " '.pax8',\n",
       " '*10a',\n",
       " 'c.1806_1809del',\n",
       " '.critial',\n",
       " '.cdk4/cdk6',\n",
       " 'ccactggagttccttaaag',\n",
       " 'l771lb',\n",
       " '.brip1',\n",
       " '.cancerassay',\n",
       " '458-',\n",
       " 'rp180k22',\n",
       " 'www.umd.be/brca2/',\n",
       " 'pdgfr/',\n",
       " 'a295t',\n",
       " '//www.ncbi.nlm.nih.gov/1.terms',\n",
       " 'il7/',\n",
       " 'melan-a+',\n",
       " 'a79t',\n",
       " '0.400.53',\n",
       " '-phosphotyrosine',\n",
       " '.sepharose',\n",
       " '.259consistent',\n",
       " 'fgfr2b-',\n",
       " '1.00nras',\n",
       " '9.71.',\n",
       " '.fixed-effects',\n",
       " '68.',\n",
       " 'p.a859t',\n",
       " 'hn-k-rasg12dsmad4+/',\n",
       " '4.38e-03',\n",
       " 'ivs17',\n",
       " '.firelut',\n",
       " 'jak2wt-',\n",
       " 'e233g',\n",
       " '201c',\n",
       " 'fii-',\n",
       " '.implementation',\n",
       " 'a*0311n',\n",
       " '-cateninlanes',\n",
       " '.droplet',\n",
       " '0.231.2.',\n",
       " '.2368a',\n",
       " '+m',\n",
       " '.smo-d473g',\n",
       " '.loxp',\n",
       " ',1002',\n",
       " '.genome-wide',\n",
       " '.epigenetic',\n",
       " '0.004-2.74',\n",
       " 'gastric/',\n",
       " '-taaatcataagaaattcg-',\n",
       " '.18,19are',\n",
       " '224560_at',\n",
       " '.haplotype',\n",
       " '.hypothesized',\n",
       " '.terminated',\n",
       " '5-aggtaagggccatctgaaaact-3',\n",
       " 'rs143479220',\n",
       " '.rptpp',\n",
       " '6-8.',\n",
       " '.org=human',\n",
       " 'mef/',\n",
       " '.secondary',\n",
       " '67.22',\n",
       " '~11',\n",
       " '22372251',\n",
       " '++/',\n",
       " '~4-fold',\n",
       " '214,215,216,217.',\n",
       " \"'seed\",\n",
       " '266.',\n",
       " '-sh2',\n",
       " '5.1.1.',\n",
       " '.surface-density',\n",
       " 'dme1+/2+',\n",
       " '19.1.',\n",
       " '-were',\n",
       " 'n.c.',\n",
       " '.89',\n",
       " '.let-7',\n",
       " 'argcys',\n",
       " '.necropsy',\n",
       " 'pyd565579v',\n",
       " '.iarc',\n",
       " '.3,9-11',\n",
       " '~0.01x',\n",
       " '37.01.40',\n",
       " '.frank',\n",
       " '1.8915.065',\n",
       " '.alonso',\n",
       " '.z',\n",
       " 'p.asp1213glyfsx2',\n",
       " '/igh',\n",
       " '.casein',\n",
       " '.taq',\n",
       " 'single-',\n",
       " '29-',\n",
       " '480-',\n",
       " '.calcium-activated',\n",
       " 'trp53/',\n",
       " '.immunofluorescence-infected',\n",
       " 'clinico-',\n",
       " 'cd19+cd27+smigd+',\n",
       " '0.511.60',\n",
       " '3.3.4.',\n",
       " '.mo-nrf2',\n",
       " 'pri-',\n",
       " '21.9/24.3',\n",
       " '.-f.',\n",
       " 'p.ile590phefsx5',\n",
       " '5.30e+04',\n",
       " \"'mixed\",\n",
       " '19,20.',\n",
       " '-beta-gal',\n",
       " 'fip1l1-',\n",
       " ':myc-ddk',\n",
       " 'msh6lq/-',\n",
       " \"'uhp\",\n",
       " '/highwire/powerpoint/42236',\n",
       " '//github.com/boyangzhao/targetid',\n",
       " '.sam',\n",
       " 'asxl1/',\n",
       " ':1761-1769',\n",
       " '.flt3-itds',\n",
       " '.ezh2',\n",
       " '-mannosidase',\n",
       " '-d17s1327',\n",
       " '.low-level',\n",
       " '.egfrrad51',\n",
       " '.rptpp-expressing',\n",
       " '5-cataatgc-',\n",
       " '104586',\n",
       " '.ha-hdmx-e',\n",
       " 'c.9257-10inst',\n",
       " 'karytoype',\n",
       " '/ir',\n",
       " 'c.-h.',\n",
       " '.lower',\n",
       " '.additionally',\n",
       " '.creb1',\n",
       " '|200',\n",
       " '223763/p53/',\n",
       " '.imagej',\n",
       " '.phe958',\n",
       " '1420.',\n",
       " 'nbg21',\n",
       " 'rbf+/+',\n",
       " 'cca-',\n",
       " '.https',\n",
       " 'v241del4.',\n",
       " 'a._',\n",
       " '.bmp',\n",
       " 'zc3h7b-1190f',\n",
       " '-deletion',\n",
       " '.3.0-',\n",
       " '0.52.',\n",
       " '.plasmidsof',\n",
       " '.orientation',\n",
       " '.dusa',\n",
       " '//www.cbioportal.org/',\n",
       " 'c.804-2a',\n",
       " '.appreciable',\n",
       " 'p19arf-',\n",
       " '.anti-c-jun',\n",
       " '+h-',\n",
       " '32.2/12.2',\n",
       " 'mll*',\n",
       " '.tcrv3',\n",
       " '//www.sanger.ac.uk/genetics/cgp/archive/',\n",
       " '.ptpn11t468m/+',\n",
       " '/not',\n",
       " 'e.g.',\n",
       " '.filled',\n",
       " 'ns-',\n",
       " 'b.v.',\n",
       " '.con',\n",
       " 'a*0253n',\n",
       " '.ring',\n",
       " '==',\n",
       " 'g287x',\n",
       " '-29a',\n",
       " 'mitosis/',\n",
       " 'age/',\n",
       " '.2637g',\n",
       " 'l..',\n",
       " '212912_at',\n",
       " '3.3.',\n",
       " '-h3k4me3',\n",
       " '.splice-site',\n",
       " '//www.bloodjournal.org/content/bloodjournal/103/6/2019/f5.large.jpg',\n",
       " '.kmt2a',\n",
       " '.apo/ta',\n",
       " 'atr/+',\n",
       " '419t',\n",
       " '.gain',\n",
       " '.alleleic',\n",
       " '0-1.40e+116',\n",
       " '.hl',\n",
       " '*percentage',\n",
       " 'n95381',\n",
       " 'database*',\n",
       " '1.214.27',\n",
       " '-b2',\n",
       " '-ment',\n",
       " 'f/50',\n",
       " 'met541',\n",
       " '1424.',\n",
       " 'tg+',\n",
       " '_mek1_',\n",
       " '.s6a2',\n",
       " '.identification',\n",
       " '.distances',\n",
       " '~mccormick',\n",
       " 'type-',\n",
       " '.set',\n",
       " '.grossly',\n",
       " '-tubilin',\n",
       " 's241f/+',\n",
       " '//rulai.cshl.edu/cgi-bin/tred/tred.cgi',\n",
       " 'p53*',\n",
       " '804.',\n",
       " '.although',\n",
       " '0.7320.084',\n",
       " '.eukaryote',\n",
       " '.l.m',\n",
       " '3.76104ivs26-20ct',\n",
       " 'gdtp-',\n",
       " '-90',\n",
       " '.5.5',\n",
       " '//dx.doi.org/10.1053/j.gastro.2013.10.020',\n",
       " '.med12-002',\n",
       " '.garcia-rostan',\n",
       " '~280kd',\n",
       " '.rare',\n",
       " '785c',\n",
       " '.cour',\n",
       " 'nfi-1-',\n",
       " '-s',\n",
       " 'c.464t',\n",
       " 'second-degree-',\n",
       " 'bcor-3954r',\n",
       " '.radiotherapy',\n",
       " '.socs1',\n",
       " '.despite',\n",
       " '.malloy1',\n",
       " 'egfr+',\n",
       " 'ckit+',\n",
       " '5867',\n",
       " '.ser46ile',\n",
       " '*9a.30',\n",
       " '.two-month-old',\n",
       " '.srp55',\n",
       " '.captured',\n",
       " '.unstimulated',\n",
       " '.28-31',\n",
       " '0.007a',\n",
       " '.v-gtttggctgaaccatcacag-3',\n",
       " 'b.a.j.p.',\n",
       " '~~~~~~~~~~~~~~~~~~~~~~~~~~~',\n",
       " '.87',\n",
       " 'cin+',\n",
       " 'trp1-exon24r',\n",
       " '-4',\n",
       " 'non-s.',\n",
       " '.cdk4+/+',\n",
       " '-hat',\n",
       " \"'ra-gene\",\n",
       " '.6,10',\n",
       " '.balb/c',\n",
       " '//code.google.com/p/align2rawsignal/',\n",
       " \"'poison\",\n",
       " 'f856la',\n",
       " '.kras-mutant',\n",
       " '.thismay',\n",
       " '.terminal',\n",
       " '.hydrophobic',\n",
       " 'elnaggar',\n",
       " '++++++-_-',\n",
       " '*2403',\n",
       " '=3.01',\n",
       " '.coat',\n",
       " '.nps-1034',\n",
       " '.hypermutation',\n",
       " 'c.2249g',\n",
       " '-fance',\n",
       " '-rt',\n",
       " '.black-color',\n",
       " '0.0136*',\n",
       " '-c-kit',\n",
       " '..4',\n",
       " '.01this',\n",
       " '.1c1c',\n",
       " '.plo',\n",
       " '-cateningenes',\n",
       " 'smad4-/-',\n",
       " '.vegf-a',\n",
       " '.staph',\n",
       " '0.11176',\n",
       " '.biochemicals',\n",
       " 'broad-',\n",
       " '2.3/na',\n",
       " '.lipid',\n",
       " '.fha',\n",
       " '.pdgf-d',\n",
       " '.plot',\n",
       " '-causing',\n",
       " 'fo-',\n",
       " '.reverse-transcription',\n",
       " '6021960160273100',\n",
       " ...}"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(vocab_words1) - set(vocab_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:47:48.631895Z",
     "start_time": "2017-09-23T20:47:48.392338Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18580"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct tokens found in vocab_words but not in vocab_words1.\n",
    "len(set(vocab_words).difference(vocab_words1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:42:23.138563Z",
     "start_time": "2017-09-23T20:42:22.869590Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "362302"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct tokens that appear in both vocab_words and vocab_words1.\n",
    "len(set(vocab_words).intersection(vocab_words1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:50:28.930648Z",
     "start_time": "2017-09-23T20:50:28.630853Z"
    },
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['stromal-epithelial',\n",
       " 'campen',\n",
       " 'cmv-p300-cha',\n",
       " 'futility',\n",
       " 'midthigh',\n",
       " 'ip-prepared',\n",
       " 'therapyresistant',\n",
       " 'l2654',\n",
       " 'ucp3',\n",
       " 'c57bl6/129',\n",
       " 'alsoa',\n",
       " 'pi3k-akt-mtor',\n",
       " 'signiflcantiy',\n",
       " 'ighv-mutated',\n",
       " 'puhd15-1',\n",
       " 'n74',\n",
       " 'low-risk6',\n",
       " 't241m',\n",
       " 'p21rasgap',\n",
       " 'talk',\n",
       " '314j18',\n",
       " '0.0182',\n",
       " 'leukemias.5,14',\n",
       " '580d',\n",
       " 'synergistic',\n",
       " 'coloectomy',\n",
       " 'c49y',\n",
       " 'slc2a5',\n",
       " 'plcepsilon',\n",
       " '54.256.0',\n",
       " 'characterizes',\n",
       " 'amplificationcodon',\n",
       " 'ligands.the',\n",
       " 'h-100',\n",
       " 'en-v80e',\n",
       " 'slide7',\n",
       " 'visconti',\n",
       " 'psmd3',\n",
       " 'suppressorthat',\n",
       " 'd24e4',\n",
       " 'arg-974',\n",
       " 'anti-p15/cdkn2b',\n",
       " 'side-effect,3',\n",
       " 'analysis7b',\n",
       " '2008a',\n",
       " 'misclassifying',\n",
       " '41a',\n",
       " 'hnf4fl/fl',\n",
       " 'biopsy/plasma',\n",
       " 'nfkbie',\n",
       " '42',\n",
       " 'rotate',\n",
       " 'hypouricemia',\n",
       " 'rhodamine-coupled',\n",
       " '3h2eb',\n",
       " 'p596',\n",
       " 'organizationthe',\n",
       " '10,14,15',\n",
       " 'recapitulates',\n",
       " 'proline/arginine',\n",
       " '235k',\n",
       " 'circumvented',\n",
       " 'thenbe',\n",
       " 'minutes.13',\n",
       " 'e343k',\n",
       " 'lines2,2',\n",
       " 'hd048502',\n",
       " 'chictr-trc-00000397',\n",
       " 'asns.luc',\n",
       " 'tle',\n",
       " '035188',\n",
       " '2f/r',\n",
       " 'yeast-based',\n",
       " 'hadjihannas',\n",
       " 'antigen-specific',\n",
       " 'wtbrca2',\n",
       " '22,24',\n",
       " 'infunium450k',\n",
       " 'cttgacaatgtgtacg',\n",
       " 'genestyrosine',\n",
       " 's1986f',\n",
       " 'rap1gap',\n",
       " 'multi-port',\n",
       " '1ot',\n",
       " 'antivirals',\n",
       " 'strauch/science',\n",
       " 'p39a',\n",
       " 'a4',\n",
       " 'khani-hanjani',\n",
       " '42/60',\n",
       " 'prolongeddouble',\n",
       " 'thr116pro',\n",
       " 't-erg',\n",
       " 'sigmoidoscopy',\n",
       " '10829018',\n",
       " 'u2af35',\n",
       " '51k',\n",
       " 'catalysisthat',\n",
       " 'p.glu227',\n",
       " 'ser15']"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(set(vocab_words) & set(vocab_words1))[:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:42:35.704978Z",
     "start_time": "2017-09-23T20:42:35.697385Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"ig\" in vocab_words1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:42:32.503344Z",
     "start_time": "2017-09-23T20:42:32.496698Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "283034"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_wordidx[\"ig-\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:49:18.805796Z",
     "start_time": "2017-09-23T20:49:18.798379Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'cdk4'"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Strip leading and trailing non-alphanumeric characters from a token.\n",
    "# Both character classes must use A-Z: the range A-z also covers the\n",
    "# punctuation between 'Z' and 'a' (including '_' and '^'), so a leading\n",
    "# underscore was treated as alphanumeric and survived the substitution.\n",
    "# NOTE(review): if this pattern was copied from a cleaning step elsewhere\n",
    "# in the notebook, apply the same A-Z correction there - TODO confirm.\n",
    "s ='_cdk4'\n",
    "re.sub('^[^a-zA-Z0-9]*|[^a-zA-Z0-9]*$','',s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-23T20:57:17.848703Z",
     "start_time": "2017-09-23T20:57:17.840935Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['_cdk4']"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tokenize '_cdk4' to see whether the leading underscore is kept in the token.\n",
    "word_tokenize('_cdk4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:dsotc-c3]",
   "language": "python",
   "name": "conda-env-dsotc-c3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {
    "height": "814px",
    "left": "0px",
    "right": "1206px",
    "top": "52px",
    "width": "265px"
   },
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
